Retrieving Sentinel 2 products

Hervé Guillon

13 July 2021

Loading libraries

# library(AIgeomorphologist)
devtools::load_all()
#> ℹ Loading AIgeomorphologist
#> Registered S3 method overwritten by 'geojsonsf':
#>   method        from   
#>   print.geojson geojson
library(sen2r)
#> Welcome to sen2r. To use the package from a GUI, launch
#>  > sen2r()
#> Documentation: https://sen2r.ranghetti.info
#> Help and bug report: https://github.com/ranghetti/sen2r/issues

Getting data

The labelled points data are stored in SSCT_data. The function to_spatial() transforms the data.frame into an sp object.

# Convert the labelled points data.frame to an sf object for spatial operations.
labelled_points <- SSCT_data %>% to_spatial() %>% sf::st_as_sf()
# Draw one random point to illustrate the data structure.
# NOTE(review): dplyr::sample_n() is superseded; consider slice_sample(n = 1).
one_random_point <- labelled_points %>% dplyr::sample_n(1)
one_random_point
#> Simple feature collection with 1 feature and 2 fields
#> Geometry type: POINT
#> Dimension:     XY
#> Bounding box:  xmin: -121.5061 ymin: 36.21968 xmax: -121.5061 ymax: 36.21968
#> CRS:           +proj=longlat +datum=WGS84 +no_defs
#>           SiteID ward                   geometry
#> 1 SCC_CL_2_31656   10 POINT (-121.5061 36.21968)

Listing S2 tiles

First, we retrieve the list of tile ids for the S2 satellites over the spatial extent of interest.

# Add id_tiles to labelled_points.
# Hoist sen2r::s2_tiles() into a variable: it retrieves the full S2 tile grid,
# so the original code paid that cost twice.
s2_grid <- sen2r::s2_tiles()
# For each point, keep the index of the first intersecting tile (NA if none).
# vapply() (not sapply()) guarantees an integer vector even for empty input.
ind <- vapply(
  sf::st_intersects(labelled_points, s2_grid),
  function(z) if (length(z) == 0) NA_integer_ else z[1],
  integer(1)
)
# Unique tile ids covering the labelled points, cached in the package data.
id_tiles <- s2_grid[ind, ] %>% dplyr::pull(tile_id) %>% unique()
usethis::use_data(id_tiles, overwrite = TRUE)
id_tiles
#>  [1] "10TDM" "10TCM" "10TDL" "10TEL" "10TFM" "10TEM" "10TDK" "10SEJ" "10SDJ"
#> [10] "10TCL" "10SEH" "10SDH" "10TCK" "10TFK" "10TFL" "10TGL" "10TGM" "10SGJ"
#> [19] "10TEK" "10SFJ" "10SGH" "10TGK" "10SFH" "11SMT" "11SMS" "11SLT" "10SGD"
#> [28] "11SKU" "11SLU" "11SNS" "10SEG" "10SFE" "10SEF" "10SFF" "10SFG" "10SGE"
#> [37] "11SLB" "11SLA" "11SMU" "11SNT" "11SKC" "11SMV" "11SNV" "11SPU" "11SNU"
#> [46] "10SGG" "11SKB" "11SKA" "11SLV" "10SGF"

Listing S2 products

Second, we retrieve the entire list of the sensed tiles for the period of interest. This command can take a few hours to run.

# Query the Google Cloud mirror for all L2A products covering id_tiles over
# the study period. This is a network call and can take a few hours to run.
search_results <- s2_list(
  tile = id_tiles, 
  time_interval = c(as.Date("2019-06-01"), as.Date("2020-10-01")),
  server = "gcloud",
  time_period = "full",
  level = "L2A",
  availability = "online"
  )
# Cache the results in the package data so the expensive query is not re-run.
usethis::use_data(search_results, overwrite = TRUE)
# Work with a plain data.frame for the summaries below.
search_results <- search_results %>% as.data.frame()
dim(search_results)
#> [1] 9442   12
colnames(search_results)
#>  [1] "name"               "url"                "mission"           
#>  [4] "level"              "id_tile"            "id_orbit"          
#>  [7] "sensing_datetime"   "ingestion_datetime" "clouds"            
#> [10] "footprint"          "uuid"               "online"
head(search_results)
#>                                                                name
#> 1 S2B_MSIL2A_20190601T183929_N0212_R070_T10TGL_20190601T222206.SAFE
#> 2 S2B_MSIL2A_20190601T183929_N0212_R070_T10TGM_20190601T222206.SAFE
#> 3 S2B_MSIL2A_20190601T183929_N0212_R070_T10SGJ_20190601T222206.SAFE
#> 4 S2B_MSIL2A_20190601T183929_N0212_R070_T10SFJ_20190601T222206.SAFE
#> 5 S2B_MSIL2A_20190601T183929_N0212_R070_T10SGH_20190601T222206.SAFE
#> 6 S2B_MSIL2A_20190601T183929_N0212_R070_T10TGK_20190601T222206.SAFE
#>                                                                                                                   url
#> 1 gs://gcp-public-data-sentinel-2/L2/tiles/10/T/GL/S2B_MSIL2A_20190601T183929_N0212_R070_T10TGL_20190601T222206.SAFE/
#> 2 gs://gcp-public-data-sentinel-2/L2/tiles/10/T/GM/S2B_MSIL2A_20190601T183929_N0212_R070_T10TGM_20190601T222206.SAFE/
#> 3 gs://gcp-public-data-sentinel-2/L2/tiles/10/S/GJ/S2B_MSIL2A_20190601T183929_N0212_R070_T10SGJ_20190601T222206.SAFE/
#> 4 gs://gcp-public-data-sentinel-2/L2/tiles/10/S/FJ/S2B_MSIL2A_20190601T183929_N0212_R070_T10SFJ_20190601T222206.SAFE/
#> 5 gs://gcp-public-data-sentinel-2/L2/tiles/10/S/GH/S2B_MSIL2A_20190601T183929_N0212_R070_T10SGH_20190601T222206.SAFE/
#> 6 gs://gcp-public-data-sentinel-2/L2/tiles/10/T/GK/S2B_MSIL2A_20190601T183929_N0212_R070_T10TGK_20190601T222206.SAFE/
#>   mission level id_tile id_orbit    sensing_datetime ingestion_datetime
#> 1      2B    2A   10TGL      070 2019-06-01 18:39:29               <NA>
#> 2      2B    2A   10TGM      070 2019-06-01 18:39:29               <NA>
#> 3      2B    2A   10SGJ      070 2019-06-01 18:39:29               <NA>
#> 4      2B    2A   10SFJ      070 2019-06-01 18:39:29               <NA>
#> 5      2B    2A   10SGH      070 2019-06-01 18:39:29               <NA>
#> 6      2B    2A   10TGK      070 2019-06-01 18:39:29               <NA>
#>      clouds
#> 1 25.182886
#> 2 59.057457
#> 3 13.727439
#> 4  6.254699
#> 5 13.039538
#> 6 13.173152
#>                                                                                                                                                                                                                                                                                                                                                                                  footprint
#> 1   POLYGON((-120.21088 40.527464112909335, -120.18938 40.59771807196067, -120.143906 40.74483797817248, -120.098236 40.89193972768891, -120.05307 41.03919338960453, -120.008575 41.18662490967887, -119.96184 41.33335054811086, -119.91577 41.48023008253367, -119.907196 41.5083620608322, -119.28943 41.49194363396281, -119.34445 40.50488583658978, -120.21088 40.527464112909335))
#> 2 POLYGON((-119.93434 41.421012232997306, -119.91577 41.48023008253367, -119.870865 41.62745286900263, -119.82495 41.77445123040062, -119.77861 41.921365826364166, -119.731964 42.068235973248576, -119.68565 42.21532441687911, -119.63928 42.36245497516326, -119.62691 42.401428368630995, -119.23686 42.39088066238852, -119.29443 41.40403474612011, -119.93434 41.421012232997306))
#> 3   POLYGON((-120.69324 38.92419035411651, -120.67732 38.9789127368281, -120.632034 39.1258089025245, -120.586655 39.27266815204012, -120.542816 39.41993885723559, -120.49855 39.56701196021211, -120.456085 39.71460393582964, -120.454025 39.72136180996339, -119.387634 39.69403311805837, -119.43793 38.70656329494251, -120.699356 38.73821864423846, -120.69324 38.92419035411651))
#> 4                                                                                                                            POLYGON((-120.747986 38.738403429976536, -120.72026 38.83136414410842, -120.67732 38.9789127368281, -120.632034 39.1258089025245, -120.586655 39.27266815204012, -120.56523 39.34466576762448, -120.58624 38.73593643111066, -120.747986 38.738403429976536))
#> 5                                                                                                                                                                                                                          POLYGON((-120.69653 38.82628170870513, -119.43355 38.79452724007538, -119.48164 37.80685703267152, -120.72763 37.83751218570364, -120.69653 38.82628170870513))
#> 6  POLYGON((-120.47946 39.633358287778705, -120.456085 39.71460393582964, -120.41147 39.861603967740805, -120.366745 40.00861927488247, -120.32405 40.156265176075856, -120.2796 40.30346029523268, -120.234436 40.45054114658088, -120.18938 40.59771807196067, -120.18408 40.61485723707699, -119.33966 40.59281342227294, -119.39224 39.60554763793313, -120.47946 39.633358287778705))
#>   uuid online
#> 1 <NA>   TRUE
#> 2 <NA>   TRUE
#> 3 <NA>   TRUE
#> 4 <NA>   TRUE
#> 5 <NA>   TRUE
#> 6 <NA>   TRUE

Determining optimal cloud cover

We can now analyze these search results so that we only download the tiles we need. One paramount parameter is the amount of clouds identified in the imagery. To define a cloud threshold, we define min_SAFE as the minimum number of sensed tiles across all tiles for a given cloud threshold and sum_SAFE as the total number of sensed tiles for a given cloud threshold.

# Per-tile cloud statistics, then collapsed into a global minimum and mean.
search_stats <- search_results %>%
  dplyr::group_by(id_tile) %>%
  dplyr::summarise(dplyr::across(dplyr::contains("clouds"), list(min = min, mean = mean))) %>%
  dplyr::summarise(clouds_min = min(clouds_min), clouds_mean = mean(clouds_mean))

# Number of SAFE products per tile at or below a given cloud threshold.
# Summing a logical mask within each group keeps tiles with no matching
# product as 0, matching the original group_map() + filter() + nrow() output,
# while avoiding the per-group filter round-trip.
n_safe_per_tile <- function(threshold) {
  search_results %>%
    dplyr::group_by(id_tile) %>%
    dplyr::summarise(n = sum(clouds <= threshold)) %>%
    dplyr::pull(n)
}

# Candidate thresholds between the global minimum and mean cloud cover.
thresholds <- seq(ceiling(search_stats$clouds_min), ceiling(search_stats$clouds_mean), by = 0.1)
# Compute the per-tile counts once per threshold: the original recomputed
# them twice, once for min_SAFE and once for sum_SAFE.
counts_by_threshold <- lapply(thresholds, n_safe_per_tile)

clouds_stats <- data.frame(
  clouds_threshold = thresholds,
  # vapply() pins the result type; min/sum of integer counts stay integer,
  # as nrow() produced in the original.
  min_SAFE = vapply(counts_by_threshold, min, integer(1)),
  sum_SAFE = vapply(counts_by_threshold, sum, integer(1))
)

The graph below shows the trade-off of incorporating more cloud cover: the minimum number of sensed tiles goes up, but so does the total number of sensed tiles. Each tile is roughly 1 GB.

# Reshape to long format so min_SAFE and sum_SAFE each get their own facet.
melted_stats <- clouds_stats %>% reshape2::melt(id.vars = c("clouds_threshold"))
# One point per threshold, faceted by statistic with free y-scales.
p <- ggplot2::ggplot(melted_stats, ggplot2::aes(x = clouds_threshold, y = value)) +
  ggplot2::geom_point() +
  ggplot2::facet_wrap(. ~ variable, scales = "free") +
  ggpubr::theme_pubclean()
# Render as an interactive plot.
plotly::ggplotly(p)

From these two graphs, we select the following threshold:

# Require at least this many sensed tiles per tile id.
sensing_limit <- 30
# First (i.e. lowest) threshold whose minimum per-tile count exceeds the limit.
clouds_stats %>% dplyr::filter(min_SAFE > sensing_limit) %>% head(1)
#>   clouds_threshold min_SAFE sum_SAFE
#> 1              5.1       31     4166
# Keep that threshold value for the product selection below.
clouds_threshold <- clouds_stats %>% dplyr::filter(min_SAFE > sensing_limit) %>% head(1) %>% dplyr::pull(clouds_threshold)

which is appropriate as the cloud percentage remains low, at least 30 tiles are sensed for any tile, and less than 3 TB of SAFE data have to be downloaded.

The selection of the product for downloading is then done by:

# Keep only products at or below the selected cloud-cover threshold.
selected_search_results <- search_results %>% dplyr::filter(clouds <= clouds_threshold)
dim(selected_search_results) 
#> [1] 4166   12

Sensing dates

Additionally, we can visualize the sensing dates with the following codes:

# Cloud cover over time, one color per tile id.
clouds_by_date <- ggplot2::ggplot(
  selected_search_results,
  ggplot2::aes(x = sensing_datetime, y = clouds, color = id_tile)
) +
  ggplot2::geom_point() +
  ggpubr::theme_pubclean()
# Render as an interactive plot.
plotly::ggplotly(clouds_by_date)

Visual inspection of the above graph, tile by tile, does not indicate a significant skew towards a given period for any tile. An example of a tile with a discontinuous record is 10TGK. The tile with the lowest average cloud cover is 10SEF.

Number of sensed tiles

The following graph shows the distribution of the number of sensed tiles:

# Number of selected SAFE products per tile, counted with dplyr::count()
# instead of the group_map() + unlist() round-trip of the original; the
# resulting one-column data.frame holds the same per-tile counts.
count_df <- selected_search_results %>%
  dplyr::count(id_tile, name = "count") %>%
  dplyr::select(count)
# Density of the per-tile product counts.
ggplot2::ggplot(count_df, ggplot2::aes(x = count)) +
ggplot2::geom_density() +
ggplot2::labs(x = "number of sensed tiles") +
ggpubr::theme_pubclean()

Downloading products

Running s2_download() then allows retrieving all the tile imagery, provided that a safelist object is passed as s2_prodlist.

class(search_results)
#> [1] "data.frame"
# s2_download() expects a safelist, not a plain data.frame; coerce first.
selected_search_results <- selected_search_results %>% as("safelist")
class(selected_search_results)
#> [1] "safelist"  "character"
# Downloads roughly 1 GB per SAFE product; point outdir at a large local disk.
s2_download(s2_prodlist = selected_search_results, outdir = "path/to/your/outdir")

Appendix

Description of Sentinel-2 MSI products

  1. Level-0 is compressed raw data. The Level-0 product contains all the information required to generate the Level-1 (and upper) product levels.
  2. Level-1
  • Level-1A is uncompressed raw data with spectral bands coarsely coregistered and ancillary data appended.
  • Level-1B data is radiometrically corrected radiance data. The physical geometric model is refined using available ground control points and appended to the product, but not applied.
  • Level-1C product provides orthorectified Top-Of-Atmosphere (TOA) reflectance, with sub-pixel multispectral registration. Cloud and land/water masks are included in the product.
  3. Level-2A product provides orthorectified Bottom-Of-Atmosphere (BOA) reflectance, with sub-pixel multispectral registration. A Scene Classification map (cloud, cloud shadows, vegetation, soils/deserts, water, snow, etc.) is included in the product.

from S2 MSI technical guide

SAFE format

The SAFE format has been designed to act as a common format for archiving and conveying data within ESA Earth Observation archiving facilities. The SAFE format wraps a folder containing image data in a binary data format and product metadata in XML. This flexibility allows the format to be scalable enough to represent all levels of SENTINEL products.

A SENTINEL-2 product refers to a directory folder that contains a collection of information (Figure 1). It includes:

  • a manifest.safe file which holds the general product information in XML
  • a preview image in JPEG2000 format
  • subfolders for measurement datasets including image data (granules/tiles) in GML-JPEG2000 format
  • subfolders for datastrip level information
  • a subfolder with auxiliary data (e.g. International Earth Rotation & Reference Systems (IERS) bulletin)
  • HTML previews

from S2 MSI data formats